Bengisu Öniz Mustafa Tilkat Gökhan Şahin Ahmet Tunçel
The main purpose of this project is to predict the department-wide sales for each store for the following year. Moreover, anaylzing the effects of markdowns on holiday and finding meaningful insights are objectives of this Project.
We have found a data set from Kaggle. The data set named Retail Data Analytics which is about one of the retail company’s sales.
There are historical sales data for 45 stores located in different regions - each store contains a number of departments. The company also runs several promotional markdown events throughout the year. These markdowns precede prominent holidays, the four largest of which are the.Super Bowl, Labor Day, Thanksgiving, and Christmas. The weeks including these holidays are weighted five times higher in the evaluation than non-holiday weeks. Within the Excel Sheet, there are 3 Tabs – Stores, Features and Sales.
Stores Anonymized information about the 45 stores, indicating the type and size of store.
Features Contains additional data related to the store, department, and regional activity for the given dates. Store - the store number Date - the week Temperature - average temperature in the region Fuel_Price - cost of fuel in the region MarkDown1-5 - anonymized data related to promotional markdowns. Mark Down data is only available after Nov 2011, and is not available for all stores all the time. Any missing value is marked with an NA CPI - the consumer price index Unemployment - the unemployment rate *Is Holiday -whether the week is a special holiday week
Sales Historical sales data, which covers to 2010-02-05 to 2012-11-01. Within this tab there are the following fields: Store - the store number Dept - the department number Date - the week Weekly_Sales - sales for the given department in the given store *IsHoliday - whether the week is a special holiday week’[1]
library(lubridate)
library(ggplot2)
library(tidyverse)
library(data.table)
library(lubridate)
library(stringr)
library(ggplot2)
library(plotly)
library(corrplot)
features_data_set <- read.csv2("Features data set.csv", header = TRUE, sep = ",")
sales_data_set <- read.csv2("sales data-set.csv", header = TRUE, sep = ",")
stores_data_set <- read.csv2("stores data-set.csv", header = TRUE, sep = ",")
str(features_data_set)
## 'data.frame': 8190 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Factor w/ 182 levels "01/02/2013","01/03/2013",..: 25 67 109 151 26 68 110 152 8 50 ...
## $ Temperature : Factor w/ 4178 levels "-2.06","-6.08",..: 1066 864 934 1308 1300 1998 1797 1601 2315 2579 ...
## $ Fuel_Price : Factor w/ 1011 levels "2.472","2.513",..: 16 10 3 12 43 58 88 95 87 121 ...
## $ MarkDown1 : Factor w/ 4023 levels "-16.93","-2781.45",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : Factor w/ 2715 levels "-0.01","-0.05",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : Factor w/ 2885 levels "-0.2","-0.73",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : Factor w/ 3405 levels "0.22","0.41",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : Factor w/ 4045 levels "-185.17","-37.02",..: NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : Factor w/ 2505 levels "126.064","126.0766452",..: 1124 1143 1148 1150 1154 1157 1139 1115 1096 1084 ...
## $ Unemployment: Factor w/ 404 levels "10.064","10.115",..: 286 286 286 286 286 286 286 286 252 252 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
str(sales_data_set)
## 'data.frame': 421570 obs. of 5 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Factor w/ 143 levels "01/04/2011","01/06/2012",..: 20 53 86 119 21 54 87 120 6 39 ...
## $ Weekly_Sales: Factor w/ 359464 levels "-0.02","-0.04",..: 140272 237401 220814 101950 120302 114885 122532 148007 272542 225936 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
str(stores_data_set)
## 'data.frame': 45 obs. of 3 variables:
## $ Store: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 2 1 2 1 2 1 2 2 ...
## $ Size : int 151315 202307 37392 205863 34875 202505 70713 155078 125833 126512 ...
features_data_set$Year <- substr(features_data_set$Date, 7, 10)
features_data_set$Month <- substr(features_data_set$Date, 4, 5)
features_data_set$Day <- substr(features_data_set$Date, 1, 2)
sales_data_set$Year <- substr(sales_data_set$Date, 7, 10)
sales_data_set$Month <- substr(sales_data_set$Date, 4, 5)
sales_data_set$Day <- substr(sales_data_set$Date, 1, 2)
sales_data_set$Weekly_Sales <- as.character(sales_data_set$Weekly_Sales)
sales_data_set$Weekly_Sales <- as.numeric(sales_data_set$Weekly_Sales,2)
ggplot(stores_data_set, aes(Type, fill = Type ) ) +
geom_bar() +
xlab("Type of Store") + ylab("Count of Store")
YearSales <- sales_data_set %>% group_by(Year) %>% summarise(YearSales = sum(Weekly_Sales))
ggplot(YearSales, aes(Year, YearSales)) +
geom_col()
SalesStore <- left_join(sales_data_set, stores_data_set, by = "Store")
ggplot(SalesStore, aes(Type, Size) ,log = "xy") +
geom_point()
plot(SalesStore$Size,SalesStore$Weekly_Sales, main = "Size vs Sales", xlab = "Store Size", ylab = "Weekly Sales")
SalesStore <- left_join(sales_data_set, stores_data_set, by = "Store")
monthsales<-SalesStore %>% group_by(Month) %>% summarise(montlysales=sum(Weekly_Sales))
monthsales$montlysales <- as.numeric(monthsales$montlysales)
qplot(x =Month , y = montlysales,data = monthsales)
deptSalesdata <- sales_data_set %>% group_by(Dept) %>% summarise(deptSales = sum(Weekly_Sales)) %>% arrange(desc(deptSales))
deptSalesdata$Dept<-as.factor(deptSalesdata$Dept)
deptSalesdata<-data.frame(deptSalesdata)
ggplot(deptSalesdata,aes(x=Dept,y=deptSales,fill=Dept)) +geom_bar(fill="#56B4E6", stat = "identity") + scale_x_discrete(name="Departments") + theme( axis.text.x = element_text(angle =90)) + ggtitle('Sales of the Departments')
features_data_set$Temperature<-as.numeric(as.vector(features_data_set$Temperature))
features_data_set$Unemployment<-as.numeric(as.vector(features_data_set$Unemployment))
features_data_set$Fuel_Price<-as.numeric(as.vector(features_data_set$Fuel_Price))
sales_data_set$Weekly_Sales<-as.numeric(as.vector(sales_data_set$Weekly_Sales))
features_temp_m <- features_data_set %>% group_by(Month) %>% summarise(ort_temp=mean(Temperature))
sales_m <- sales_data_set %>%group_by(Month) %>% summarise(ort_sa=mean(Weekly_Sales))
temp_sales <- inner_join(sales_m,features_temp_m,by="Month")
ggplot(temp_sales, aes(x = Month, y = ort_temp, size = ort_sa)) +
geom_point(shape = 21,colour = "#000000", fill = "#40b8d0")
features_Unem_m <- features_data_set %>% group_by(Month) %>% summarise(avg_une=mean(Unemployment))
Unem_sales <- inner_join(sales_m,features_Unem_m,by="Month")
ggplot(Unem_sales, aes(x = Month, y = avg_une, size = ort_sa)) +
geom_point(shape = 21,colour = "#000000", fill = "#40b8d0")
Looking at the percentages of the montly sales by the stores
Sales<-data.table(sales_data_set)
Features<-data.table(features_data_set)
Stores<-data.table(stores_data_set)
Sales<-Sales[,list(Store,Dept,Date,Weekly_Sales)]
setkey(Sales,Store,Date)
setkey(Features,Store,Date)
Sales<-Features[Sales]
setkey(Sales,Store)
setkey(Stores,Store)
Sales<-Stores[Sales]
str(Sales)
## Classes 'data.table' and 'data.frame': 421570 obs. of 19 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Date : Factor w/ 143 levels "01/04/2011","01/06/2012",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Temperature : num 59.2 59.2 59.2 59.2 59.2 ...
## $ Fuel_Price : num 3.52 3.52 3.52 3.52 3.52 ...
## $ MarkDown1 : Factor w/ 4023 levels "-16.93","-2781.45",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : Factor w/ 2715 levels "-0.01","-0.05",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : Factor w/ 2885 levels "-0.2","-0.73",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : Factor w/ 3405 levels "0.22","0.41",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : Factor w/ 4045 levels "-185.17","-37.02",..: NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : Factor w/ 2505 levels "126.064","126.0766452",..: 1524 1524 1524 1524 1524 1524 1524 1524 1524 1524 ...
## $ Unemployment: num 7.68 7.68 7.68 7.68 7.68 ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Year : chr "2011" "2011" "2011" "2011" ...
## $ Month : chr "04" "04" "04" "04" ...
## $ Day : chr "01" "01" "01" "01" ...
## $ Dept : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Weekly_Sales: num 20398 46992 8734 34452 23599 ...
## - attr(*, "sorted")= chr "Store"
## - attr(*, ".internal.selfref")=<externalptr>
summary(Sales)
## Store Type Size Date
## Min. : 1.0 A:215478 Min. : 34875 23/12/2011: 3027
## 1st Qu.:11.0 B:163495 1st Qu.: 93638 25/11/2011: 3021
## Median :22.0 C: 42597 Median :140167 16/12/2011: 3013
## Mean :22.2 Mean :136728 09/12/2011: 3010
## 3rd Qu.:33.0 3rd Qu.:202505 17/02/2012: 3007
## Max. :45.0 Max. :219622 30/12/2011: 3003
## (Other) :403489
## Temperature Fuel_Price MarkDown1 MarkDown2
## Min. : -2.06 Min. :2.472 1.5 : 102 1.91 : 539
## 1st Qu.: 46.68 1st Qu.:2.933 460.73 : 102 3 : 493
## Median : 62.09 Median :3.452 175.64 : 93 0.5 : 485
## Mean : 60.09 Mean :3.361 1282.42: 75 1.5 : 471
## 3rd Qu.: 74.28 3rd Qu.:3.738 1483.17: 75 4 : 367
## Max. :100.14 Max. :4.468 (Other):150234 (Other):108893
## NA's :270889 NA's :310322
## MarkDown3 MarkDown4 MarkDown5 CPI
## 3 : 754 9 : 280 2743.18 : 136 129.8555333: 711
## 6 : 710 4 : 200 1064.56 : 120 131.1083333: 708
## 2 : 660 2 : 197 20371.02: 75 129.8459667: 707
## 1 : 611 3 : 146 3557.67 : 75 130.3849032: 706
## 0.22 : 487 47 : 143 3567.03 : 75 130.6457931: 706
## (Other):133869 (Other):134001 (Other) :150951 130.683 : 706
## NA's :284479 NA's :286603 NA's :270138 (Other) :417326
## Unemployment IsHoliday Year Month
## Min. : 3.879 Mode :logical Length:421570 Length:421570
## 1st Qu.: 6.891 FALSE:391909 Class :character Class :character
## Median : 7.866 TRUE :29661 Mode :character Mode :character
## Mean : 7.960
## 3rd Qu.: 8.572
## Max. :14.313
##
## Day Dept Weekly_Sales
## Length:421570 Min. : 1.00 Min. : -4989
## Class :character 1st Qu.:18.00 1st Qu.: 2080
## Mode :character Median :37.00 Median : 7612
## Mean :44.26 Mean : 15981
## 3rd Qu.:74.00 3rd Qu.: 20206
## Max. :99.00 Max. :693099
##
Sales[,Month:=as.numeric(substring(as.character(Date),4,5))]
Sales$Date<-dmy(Sales$Date)
MonthlySales<-Sales[,sum(Weekly_Sales,na.rm = TRUE),.(Store,Month)]
setnames(MonthlySales,"V1","Monthly_Sales")
MonthlySales[,TotalSales:=sum(Monthly_Sales,na.rm = TRUE),.(Month)]
MonthlySales[,SalesPercantage:=Monthly_Sales*1.0/TotalSales]
Clusno<-5
CM=dcast.data.table(MonthlySales,Store~Month,value.var="SalesPercantage")
S<-colnames(CM)
CM<-data.frame(CM)
CM[is.na(CM)]=0
colnames(CM)<-S
CM<-data.table(CM)
# basl<-which(colnames(rr)=="2")
# bitis<-which(colnames(rr)=="269")
set.seed(7)
CM[,clusno:=kmeans(CM[,c(2:ncol(CM)),with=F],Clusno)$cluster]
clusters<-CM[,list(Store,clusno)]
setkey(clusters,Store)
setkey(MonthlySales,Store)
MonthlySales<-clusters[MonthlySales]
SalesP<-dcast.data.table(MonthlySales,Month~Store,value.var="SalesPercantage")
MonthlySales$Month <- factor(MonthlySales$Month)
MonthlySales$Store <- factor(MonthlySales$Store)
MonthlySales$clusno <- factor(MonthlySales$clusno)
# plotting reference lines across each facet:
referenceLines <- MonthlySales # \/ Rename
colnames(referenceLines)[2] <- "groupVar"
zp <- ggplot(MonthlySales,
aes(x = Month, y = SalesPercantage))
zp <- zp + geom_line(data = referenceLines, # Plotting the "underlayer"
aes(x = Month, y = SalesPercantage, group = groupVar),
colour = "GRAY", alpha = 1/2, size = 1/2)
zp <- zp + geom_line(size = 1) # Drawing the "overlayer"
zp <- zp + facet_wrap(~ Store)
zp <- zp + theme_bw()
ggplotly()
ggplot(MonthlySales, aes(x=Month, y=SalesPercantage, color=clusno, group=Store)) +
geom_line()
ggplotly()
YearlySales<-Sales[,sum(Weekly_Sales,na.rm = TRUE),.(Store,Type,Size)]
setnames(YearlySales,"V1","Yearly_Sales")
ggplot(YearlySales,aes(x=Size,y=Yearly_Sales)) +
geom_point()+
geom_smooth(method=lm,color="RED",se = FALSE)+
scale_x_continuous(waiver()) + scale_y_continuous(waiver())
daysales<-Sales %>%
group_by(Day) %>%
summarise(Salesofthedays=sum(Weekly_Sales))
daysales$Day<-as.factor(daysales$Day)
ggplot(daysales,aes(x=Day,y=Salesofthedays,fill=Day)) +geom_bar(fill="#FF6666", stat = "identity") + scale_x_discrete(name="Days") + theme( axis.text.x = element_text(angle =90)) + ggtitle('Sales of the Days')
alldata <- inner_join(SalesStore, features_data_set , by = c("Store", "Year", "Month", "Day"))
selectcol <- alldata %>% select( Size, CPI ,Unemployment ,Fuel_Price, Weekly_Sales, Temperature)
selectcol$CPI <- as.numeric(as.character(selectcol$CPI))
selectcol$Unemployment <- as.numeric(as.character(selectcol$Unemployment))
selectcol$Fuel_Price <- as.numeric(as.character(selectcol$Fuel_Price))
selectcol$Weekly_Sales <- as.numeric(as.character(selectcol$Weekly_Sales))
selectcol$Temperature <- as.numeric(as.character(selectcol$Temperature))
matrixdata <- as.matrix(selectcol)
corrplot(cor(selectcol) ,method = "circle")
###References #_Retail Data Analytics. (2017, August). Retrieved from https://www.kaggle.com/manjeetsingh/retaildataset_